{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import Series, DataFrame"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0         ﻿The\n",
       "1      Project\n",
       "2    Gutenberg\n",
       "3        EBook\n",
       "4           of\n",
       "dtype: object"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "filename = '../data/alice-in-wonderland.txt'\n",
    "\n",
    "s = Series(open(filename).read().split())\n",
    "s.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 1\n",
    "\n",
    "What is the mean of all integers in Alice?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "8030.851851851852"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import string\n",
    "\n",
    "(\n",
    "    s\n",
    "    .str\n",
    "    .strip(string.punctuation)\n",
    "    .loc[lambda s_: s_.str.isdigit()]\n",
    "    .astype(int)\n",
    "    .mean()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 2\n",
    "\n",
    "What words in Alice don't appear in the dictionary? Which are the five most common such words?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "words = {one_word.strip() for one_word in open('../data/words.txt')}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Project      83\n",
       "She          36\n",
       "Rabbit       28\n",
       "Queen        27\n",
       "Gutenberg    27\n",
       "             ..\n",
       "reasons       1\n",
       "knocked       1\n",
       "curls         1\n",
       "From          1\n",
       "includes      1\n",
       "Name: count, Length: 758, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "(\n",
    "    s\n",
    "    .str.strip(string.punctuation)      # Strip punctuation\n",
    "    .loc[lambda s_: s_.str.isalpha()]   # Keep only those with letters\n",
    "    .loc[lambda s_: ~s_.isin(words)]    # Now keep those *not* in the dictionary, and find the most common ones\n",
    "    .value_counts()\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Beyond 3\n",
    "\n",
    "What is the mean number of words per paragraph?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "count    393.000000\n",
       "mean      32.475827\n",
       "std       32.428415\n",
       "min        0.000000\n",
       "25%        7.000000\n",
       "50%       22.000000\n",
       "75%       48.000000\n",
       "max      169.000000\n",
       "dtype: float64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Read the file into a series by paragraph\n",
    "s = Series(open(filename).read().split('\\n\\n'))\n",
    "\n",
    "# Just use describe to get min, max, and everything else\n",
    "(\n",
    "    s\n",
    "    .str\n",
    "    .split()\n",
    "    .str\n",
    "    .len()\n",
    "    .describe() \n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
